# -*- coding: utf-8 -*-
'''
Created on Mar 24, 2013

@author: LONG HOANG GIANG
'''

import urllib, json, os, gzip, cStringIO as StringIO
from lxml import etree

def getHtml(url):
    '''Download *url* and parse the response body as HTML.

    The body is read fully into memory and handed to lxml's HTML parser,
    which is told to decode it as UTF-8.

    :param url: address of the page to fetch.
    :return: the lxml element tree built from the page.
    '''
    response = urllib.urlopen(url)
    try:
        body = response.read()
    finally:
        # Release the connection even when the read fails; this function is
        # called once per chapter, so leaked handles would pile up.
        response.close()
    return etree.parse(StringIO.StringIO(body), parser=etree.HTMLParser(encoding='utf-8'))
    
def process(url):
    '''Scrape the chapter index at *url* and save every chapter's image list.

    Each anchor found in the first ``th`` cell of the ``mytable`` table is
    treated as a chapter link. The chapter page is fetched and the ``src`` of
    every ``img`` inside the ``portfolio`` list (skipping the first ``li``)
    is collected. The result, a list of ``{'chapter': ..., 'images': [...]}``
    dicts, is written as gzip-compressed JSON to ``comic.data`` inside the
    directory named by the module-level ``OUTPUT``.

    Side effect: ``OUTPUT`` is normalised in place to end with ``/``.

    :param url: address of the comic's chapter index page.
    '''
    global OUTPUT

    tree = getHtml(url)
    data = []
    for item in tree.xpath("//table[@class='mytable']//th[1]/a"):
        href = item.get('href')
        if not href:
            # An anchor without a target cannot be followed.
            continue
        link = 'http://manga24h.com/' + href
        # The anchor may hold no direct text node (e.g. only child markup);
        # fall back to an empty title instead of raising IndexError.
        texts = item.xpath("./text()")
        title = texts[0].strip() if texts else ''
        print('--------------------------------')
        print('%s %s' % (title, link))
        print('--------------------------------')
        itree = getHtml(link)
        images = []
        for image in itree.xpath("//ul[@id='portfolio']/li[position()>1]/img"):
            src = image.get('src', '').strip()
            if not src:
                continue
            print(src)
            images.append(src)
        data.append({'chapter': title, 'images': images})

    # Append the separator only when it is missing. The previous check
    # compared the last character with '' and therefore always appended,
    # growing OUTPUT by one '/' on every call.
    if OUTPUT and not OUTPUT.endswith('/'):
        OUTPUT += '/'
    # An empty OUTPUT means "current directory": nothing to create.
    if OUTPUT and not os.path.isdir(OUTPUT):
        os.makedirs(OUTPUT, 0o777)
    fp = gzip.open(OUTPUT + 'comic.data', 'wb')
    try:
        json.dump(data, fp)
    finally:
        # Close the archive even if serialisation fails.
        fp.close()
    

if __name__ == '__main__':
    # Chapter index page of the comic to scrape.
    URL = 'http://manga24h.com/29/O-Long-Vien.html'
    # Destination directory; process() reads this module-level global.
    OUTPUT = '/longhoanggiang/comic/' + 'olongvien'
    process(URL)

    print('> Finished')
    # Stop here with a success status. os._exit(1) reported failure to the
    # caller and skipped stdio flushing, which could drop the line above
    # when stdout is piped.
    raise SystemExit(0)
    
    
    
    